1 Input Settings

ampliconFastaFile <- params$ampliconFastaFile
ampliconName <- params$ampliconName
sampleSheetFile <- params$sampleSheetFile
indelsFolder <- params$indelsFolder
cutSitesFile <- params$cutSitesFile
prefix <- params$prefix
workdir <- params$workdir

inputParameterDesc <- c('Amplicon Fasta File',
                        'Amplicon Name',
                        'sgRNA Cut Sites File', 
                     'Experiment Data File',
                     'indels folder path', 
                     'Prefix for output files',
                     'Working directory'
                     )
inputParameterValues <- c(ampliconFastaFile,
                          ampliconName,
                          cutSitesFile, 
                          sampleSheetFile,
                          indelsFolder, 
                          prefix,
                          workdir)
inputSettings <- data.frame(parameters = inputParameterDesc,
                            values = inputParameterValues,
                            stringsAsFactors = FALSE)
DT::datatable(data = inputSettings,
              extensions = 'FixedColumns',
              options = list(fixedColumns = TRUE,
                         scrollX = TRUE,
                         pageLength = nrow(inputSettings),
                         dom = 't'))
sampleSheet <- read.csv(sampleSheetFile, stringsAsFactors = F)
sampleSheet <- sampleSheet[sampleSheet$amplicon == ampliconName,]
DT::datatable(data = sampleSheet,
              extensions = 'FixedColumns',
              options = list(fixedColumns = TRUE,
                         scrollX = TRUE,
                         dom = 't'))
cutSites <- read.table(cutSitesFile, stringsAsFactors = F)
colnames(cutSites) <- c('sgRNA', 'cutSite')
DT::datatable(data = cutSites)

2 Coverage Profiles

2.1 Coverage as Heatmap

# import stats tables 
coverageStats <- as.data.table(do.call(rbind, lapply(sampleSheet$sample_name, function(sampleName) {
  f <- file.path(indelsFolder, paste0(sampleName, '.coverageStats.tsv'))
  if(file.exists(f)) {
    dt <- data.table::fread(f)
    return(dt)
  }
})))
#create a matrix of bp versus samples where values are coverage
dt <- dcast.data.table(coverageStats, sample ~ bp, value.var = 'cov', fill = 0)
M <- as.matrix(dt[,-1])
rownames(M) <- dt$sample
pheatmap::pheatmap(M, 
                   cluster_rows = nrow(M) > 1, 
                   cluster_cols = F, 
                   show_colnames = F, 
                   main = paste(ampliconName, 'Coverage Profiles'))

2.2 Coverage as Line Plot

p <- ggplot(coverageStats, aes(x = bp, y = cov, group = sample), height = 500) + 
  geom_line(aes(color = sample), show.legend = F) +   
  theme_bw() +  
  labs(x = 'base position', y = 'Number of Reads', title = paste(ampliconName, 'Coverage Profiles'))

ggplotly(p)

3 Indel Score Profiles

plotScores <- function(indelsFolder, sampleName, sampleSheet, cutSites) {
  f <- file.path(indelsFolder, paste0(sampleName, '.coverageStats.tsv'))
  dt <- data.table::fread(f)

  sgRNAs <- unlist(strsplit(x = sampleSheet[sampleSheet$sample_name == sampleName,]$sgRNA_ids, 
                     split = ':'))
  
  sgRNAs <- merge(data.frame('sgRNA' = sgRNAs, 'sample_name' = sampleName), cutSites, by = 'sgRNA')

  mdt <- melt(dt[,-c('ins', 'del', 'indel', 'cov')], id.vars = c('bp', 'sample', 'seqname'))
  
  p <- ggplot(mdt, aes(x = bp, y = value, group = variable)) +
    geom_line(aes(color = variable)) + labs(title = sampleName)
  if(nrow(sgRNAs) > 0) {
    p <- p + geom_vline(data = sgRNAs, aes(xintercept = cutSite, 
                                           color = sgRNA), linetype = 'dotted')
  }
  p <- p + 
    theme_bw(base_size = 14) + 
    scale_y_continuous(labels = scales::percent)
  return(p)
}

plots <- lapply(sampleSheet$sample_name, function(sampleName) {
  plotScores(indelsFolder = indelsFolder, 
             sampleName = sampleName, 
             sampleSheet = sampleSheet, 
             cutSites = cutSites)
})
names(plots) <- sampleSheet$sample_name
if(length(plots) > 10) {
  cat("## Indel Score Profiles {.tabset .tabset-dropdown}\n\n")
} else {
  cat("## Indel Score Profiles {.tabset}\n\n")
}

3.1 Indel Score Profiles

out = NULL
for (n in names(plots)) {
  p <- ggplotly(plots[[n]])
  out = c(out, knitr::knit_expand(text='### {{n}} \n\n {{p}} \n\n'))
}

3.1.1 lin41RNA_pool1

4 sgRNA efficiencies at cut sites

# this is a table for all samples versus all cut sites in the amplicon
# not all sample x sgRNA combinations are true, 
# but still they are also computed to serve as a negative control
cutSiteStats <- as.data.table(do.call(rbind, lapply(sampleSheet$sample_name, function(sampleName) {
  f <- file.path(indelsFolder, paste0(sampleName, '.indel_stats_at_cutsites.tsv'))
  if(file.exists(f)) {
    dt <- fread(f)
    return(dt)
  }
})))

# match samples to the actual sgRNA guides used in that sample
sampleGuides <- lapply(sampleSheet$sample_name, function(s) {
  sgRNAs <- unlist(strsplit(x = sampleSheet[sampleSheet$sample_name == s,]$sgRNA_ids, 
                   split = ':'))
})
names(sampleGuides) <- as.character(sampleSheet$sample_name)

cutSiteStats$sampleMatchesGuide <- as.factor(apply(cutSiteStats, 1, function(x) {
  s <- as.character(x[['sample']])
  g <- as.character(x[['sgRNA']])
  return(g %in% sampleGuides[[s]])
}))

plots <- lapply(as.vector(unique(cutSiteStats$sgRNA)), function(sg) {
  dt <- cutSiteStats[sgRNA == sg & sampleMatchesGuide == TRUE]
  if(nrow(dt) > 0) {
    p <- ggplot(dt, aes(x = sample, y = indelEfficiency)) + 
      geom_bar(stat = 'identity', aes(fill = sample)) + 
      labs(title = sg) + coord_flip()
    return(p)
  }
})
names(plots) <- as.vector(unique(cutSiteStats$sgRNA))
if(length(plots) > 10) {
  cat("## Indel Stats At Cut Sites {.tabset .tabset-dropdown}\n\n")
} else {
  cat("## Indel Stats At Cut Sites {.tabset}\n\n")
}

4.1 Indel Stats At Cut Sites

out = NULL
for (n in names(plots)) {
  if(!is.null(plots[[n]])) {
  p <- ggplotly(plots[[n]])
  out = c(out, knitr::knit_expand(text='### {{n}} \n\n {{p}} \n\n'))
  } else {
    out = c(out, knitr::knit_expand(text='### {{n}} \n\n No indel data found\n\n'))
  }
}

4.1.1 lin-41_CDS_sg1

No indel data found

4.1.2 lin-41_CDS_sg2

No indel data found

4.1.3 lin-41_UTR_sg1

4.1.4 lin-41_UTR_sg2

No indel data found

4.1.5 lin-41_UTR_sg3

No indel data found

4.1.6 lin-41_UTR_sg4

4.1.7 lin-41_UTR_sg5

No indel data found

4.1.8 lin-41_UTR_sg7

No indel data found

4.1.9 lin-41_UTR_sg8

No indel data found

4.1.10 lin-41_UTR_sg10

4.1.11 lin-41_UTR_sg11

No indel data found

4.1.12 lin-41_UTR_sg14

No indel data found

4.1.13 lin-41_UTR_sg15

No indel data found

4.1.14 lin-41_UTR_sg16

4.1.15 lin-41_UTR_sg20

No indel data found

4.1.16 lin-41_UTR_sg22

4.1.17 lin-41_UTR_sg24

No indel data found

4.1.18 lin-41_UTR_sg26

No indel data found

4.1.19 lin-41_UTR_sg27

No indel data found

5 sgRNA efficiencies at cut sites - version 2

dts <- crosstalk::SharedData$new(cutSiteStats)

bscols(widths = c(4,8),
  list(
    filter_checkbox("sampleMatchesGuide", "Own Guide", dts, ~factor(sampleMatchesGuide), inline = TRUE),
    filter_select( "sgRNA", "Select sgRNA", dts, ~factor(sgRNA), allLevels = FALSE, multiple = FALSE),
    filter_select( "sample", "Select samples", dts, ~factor(sample), allLevels = FALSE, multiple = TRUE)
    ),
  plotly::plot_ly(data = dts,
                  x = ~ sgRNA,
                  y = ~ indelEfficiency,
                  group = ~sgRNA,
                  color = ~sample,
                  type = 'bar',
                  text = ~sample,
                  height = 500)  %>%
    layout(showlegend = FALSE, xaxis = list(showticklabels = TRUE), barmode = 'group')
)

6 Indel diversity at cutsites

# indels: a data.table object with minimal columns: start, end, 
# cutSites: a data.frame/data.table object with minimal columns: sgRNA and cutsite
# return: data.frame (nrow = nrow(indels), columns are sgRNA ids, 
#         values are 1 if indel overlaps cutsite, otherwise 0. 
overlapCutSites <- function(indels, cutSites, extend = 3) {
  target <- IRanges(start = cutSites$cutSite - extend, 
                    end = cutSites$cutSite + extend, 
                    names = cutSites$sgRNA)
  query <- IRanges(start = indels$start, end = indels$end)
  
  startOverlaps <- as.data.table(findOverlaps(start(query), target, type = 'any'))
  endOverlaps <- as.data.table(findOverlaps(end(query), target, type = 'any'))
  
  overlaps <- merge(startOverlaps, endOverlaps, by = c('queryHits', 'subjectHits'), all = TRUE)
  
  M <- matrix(data = rep(0, nrow(indels) * length(target)), 
              nrow = nrow(indels), ncol = length(target))
  colnames(M) <- names(target)

  M[as.matrix(overlaps)] <- 1
  
  return(M)
}

#import all detected indels (unfiltered) from all samples
allIndels <- as.data.table(do.call(rbind, lapply(sampleSheet$sample_name, function(sampleName) {
  f <- file.path(indelsFolder, paste0(sampleName, '.indels.unfiltered.tsv'))
  if(file.exists(f)) {
    dt <- fread(f)
    return(dt)
  }
})))

#summarize indels by read support
indelSummary <- allIndels[,length(unique(readID)), by = c('seqname', 'sample', 'start', 'end', 'indelType')]
colnames(indelSummary)[6] <- 'ReadSupport'

dt <- cbind(indelSummary, 
            overlapCutSites(indels = indelSummary, 
                            cutSites = cutSites, extend = 3))

mdt <- melt(dt, id.vars = colnames(indelSummary))

#make plots for different score thresholds, facet by samples, guides, and indel types
supportThresholds <- c(0, 1, 5, 10)
plots <- lapply(supportThresholds, function(s) {
  df <- mdt[ReadSupport > s & value == 1,
            length(seqname), 
            by = c('sample', 'variable', 'indelType')]
  if(nrow(df) > 0) {
    p <- ggplot(data = df, aes(x = sample, y = variable)) + 
    #geom_point(aes(size = V1)) + 
    geom_tile(aes(fill = V1)) 
    if(nrow(df) < 250) {
      p <- p + 
        geom_text(aes(label = V1), color = 'white') + facet_wrap(~ indelType, ncol = 1) 
    }
    p <- p + theme_bw() + 
    theme(axis.text.x = element_text(angle = 90), legend.position = 'none')
    return(p)
  } else {
    NULL
  }
})
names(plots) <- paste("Read Support >",supportThresholds)
for (i in 1:length(plots)) {
  cat("## ",names(plots)[i],"\n\n")
  p <- plots[[i]]
  if(!is.null(p)) {
    print(plots[[i]])
  } else {
    cat("No genotypes detected")
  }
  cat("\n\n")
}

6.1 Read Support > 0

6.2 Read Support > 1

No genotypes detected

6.3 Read Support > 5

No genotypes detected

6.4 Read Support > 10

No genotypes detected

7 R Session Information

print(sessionInfo())
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/edanner/anaconda3/lib/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    methods   stats     graphics  grDevices utils    
## [8] datasets  base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2       crosstalk_1.0.0      plotly_4.7.1        
##  [4] DT_0.4               rtracklayer_1.38.3   GenomicRanges_1.30.3
##  [7] GenomeInfoDb_1.14.0  IRanges_2.12.0       S4Vectors_0.16.0    
## [10] BiocGenerics_0.24.0  data.table_1.11.4    ggrepel_0.8.0       
## [13] ggplot2_3.0.0        knitr_1.20          
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.38.0             httr_1.3.1                
##  [3] tidyr_0.8.1                jsonlite_1.5              
##  [5] viridisLite_0.3.0          shiny_1.1.0               
##  [7] assertthat_0.2.0           GenomeInfoDbData_1.0.0    
##  [9] Rsamtools_1.30.0           yaml_2.1.19               
## [11] pillar_1.2.3               backports_1.1.2           
## [13] lattice_0.20-35            glue_1.2.0                
## [15] digest_0.6.15              RColorBrewer_1.1-2        
## [17] promises_1.0.1             XVector_0.18.0            
## [19] colorspace_1.3-2           htmltools_0.3.6           
## [21] httpuv_1.4.4.2             Matrix_1.2-14             
## [23] plyr_1.8.4                 XML_3.98-1.11             
## [25] pkgconfig_2.0.1            pheatmap_1.0.10           
## [27] zlibbioc_1.24.0            purrr_0.2.5               
## [29] xtable_1.8-2               scales_0.5.0              
## [31] later_0.7.3                BiocParallel_1.12.0       
## [33] tibble_1.4.2               withr_2.1.2               
## [35] SummarizedExperiment_1.8.1 lazyeval_0.2.1            
## [37] magrittr_1.5               mime_0.5                  
## [39] evaluate_0.10.1            tools_3.4.3               
## [41] matrixStats_0.53.1         stringr_1.3.1             
## [43] munsell_0.5.0              DelayedArray_0.4.1        
## [45] Biostrings_2.46.0          compiler_3.4.3            
## [47] rlang_0.2.1                grid_3.4.3                
## [49] RCurl_1.95-4.10            htmlwidgets_1.2           
## [51] bitops_1.0-6               labeling_0.3              
## [53] rmarkdown_1.10             gtable_0.2.0              
## [55] R6_2.2.2                   GenomicAlignments_1.14.2  
## [57] dplyr_0.7.6                bindr_0.1.1               
## [59] rprojroot_1.3-2            stringi_1.2.3             
## [61] Rcpp_0.12.17               tidyselect_0.2.4